import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
# Preprocessing
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
# Model
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
# Feature Engineering
from sklearn.feature_selection import VarianceThreshold, RFE
# Hyperparameters tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
# Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.metrics import precision_recall_curve, roc_curve, auc, roc_auc_score
from sklearn.model_selection import cross_val_score
# XAI
import dalex as dx
import shap
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv('data/census_income_dataset.csv')
df.head()
| age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | income_level | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | 77516.0 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174.0 | 0.0 | 40.0 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | 83311.0 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0.0 | 0.0 | 13.0 | United-States | <=50K |
| 2 | 38 | Private | 215646.0 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0.0 | 0.0 | 40.0 | United-States | <=50K |
| 3 | 53 | Private | 234721.0 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0.0 | 0.0 | 40.0 | United-States | <=50K |
| 4 | 28 | Private | 338409.0 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0.0 | 0.0 | 40.0 | Cuba | <=50K |
# zmienne kategoryczne
df.replace('?', np.nan, inplace=True)
# zmienne numeryczne
df.replace(-100000, np.nan, inplace=True)
Zmienna native_country ma kategorię, która ma znacznie więcej obserwacji niż inne kategorie, więc możemy tą wartością zastąpić braki danych.
df['native_country'].fillna(value="United-States", inplace=True)
W przypadku zmiennych workclass oraz occupation skorzystajmy z imputacji LOCF.
df['workclass'].fillna(method = 'ffill', inplace = True)
df['occupation'].fillna(method = 'ffill', inplace = True)
X = df.drop(columns=['income_level'])
y = df['income_level']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)
dc = DummyClassifier(strategy='most_frequent')
dc.fit(X_train, y_train)
preds = dc.predict(X_test)
print(f'Accuracy for dummy model: {accuracy_score(y_test, preds)}')
Accuracy for dummy model: 0.7606715119254785
def show_metrics(model, X, y):
"""
Prints out the most important metrics
"""
y_pred = model.predict(X)
sns.heatmap(confusion_matrix(y, y_pred), annot=True)
plt.show()
print('Accuracy: ', accuracy_score(y, y_pred))
print('Precision: ', precision_score(y, y_pred))
print('Recall: ', recall_score(y, y_pred))
print('F1 score: ', f1_score(y, y_pred))
def gini_train_val(model, X_train, y_train, X_val, y_val):
y_pred_proba = model.predict_proba(X_train)[::,1]
fpr, tpr, _ = roc_curve(y_train, y_pred_proba)
plt.plot(fpr, tpr, label='X_train')
roc_auc = auc(fpr, tpr)
gini_train = (2 * roc_auc) - 1
y_pred_proba = model.predict_proba(X_val)[::,1]
fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
plt.plot(fpr, tpr, label='X_val')
roc_auc = auc(fpr, tpr)
gini_val = (2 * roc_auc) - 1
plt.plot([0, 1], [0, 1], "--")
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title("ROC Curve")
plt.legend()
plt.show()
print("gini_train: %.4f" % gini_train)
print("gini_val: %.4f" % gini_val)
return
def gini_compare_models(model1, model2, X1, y1, X2, y2):
y_pred_proba = model1.predict_proba(X1)[::,1]
fpr, tpr, _ = roc_curve(y1, y_pred_proba)
plt.plot(fpr, tpr, label='model1')
roc_auc = auc(fpr, tpr)
gini1 = (2 * roc_auc) - 1
y_pred_proba = model2.predict_proba(X2)[::,1]
fpr, tpr, _ = roc_curve(y2, y_pred_proba)
plt.plot(fpr, tpr, label='model2')
roc_auc = auc(fpr, tpr)
gini2 = (2 * roc_auc) - 1
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend()
plt.show()
print("Gini for model 1: %.4f" % gini1)
print("Gini for model 2: %.4f" % gini2)
Zmienna fnlwgt to przypisane do poszczególnych obserwacji wagi odzwierciedlającę wielkość populacji, z której próbkowane były poszczególne obserwacje. Zmienna ta nie będzie przydatna przy modelowaniu, dlatego usuwamy ją.
df.drop(columns=['fnlwgt'], inplace=True)
X = df.drop(columns=['income_level', 'education'])
y = df['income_level'].map({'<=50K': 0, '>50K': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)
cat_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
base_transformer = ColumnTransformer(
[
('one_hot', OneHotEncoder(handle_unknown='ignore'), cat_features),
],
remainder='passthrough'
)
base_models = [
('lr_model', LogisticRegression(random_state=123, n_jobs=-1)),
('svm_model', SVC(random_state=123)),
('dt_model', DecisionTreeClassifier(random_state=123)),
('rf_model', RandomForestClassifier(random_state=123)),
('xgb_model', (XGBClassifier(random_state=123)))
]
kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
for name, model in base_models:
pipe = Pipeline(
[
('transformer', base_transformer),
('model', model)
]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'{name} cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
lr_model cross validation accuracy score: 0.8325952680041286 +/- 0.010626215579691066 min: 0.8170185540627 max: 0.8463211772232886 svm_model cross validation accuracy score: 0.8026257774118374 +/- 0.002088968710590899 min: 0.799334527770668 max: 0.8052463211772233 dt_model cross validation accuracy score: 0.8211552242877878 +/- 0.002345955530738737 min: 0.818914768364474 max: 0.8253358925143954 rf_model cross validation accuracy score: 0.8465434860179271 +/- 0.00468027513949076 min: 0.8383670335295623 max: 0.8510556621880998 xgb_model cross validation accuracy score: 0.8722647262218297 +/- 0.000840029306686614 min: 0.8708893154190659 max: 0.8733205374280231
def gini_score(y_true, y_score):
return (2 * roc_auc_score(y_true, y_score)) - 1
test_results = []
X = df.drop(columns=['income_level', 'education'])
y = df['income_level'].map({'<=50K': 0, '>50K': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)
cat_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
base_transformer = ColumnTransformer(
[
('one_hot', OneHotEncoder(handle_unknown='ignore'), cat_features),
],
remainder='passthrough'
)
base_models = [
('lr_model', LogisticRegression(random_state=123, n_jobs=-1)),
('dt_model', DecisionTreeClassifier(random_state=123)),
('rf_model', RandomForestClassifier(random_state=123)),
('xgb_model', (XGBClassifier(random_state=123)))
]
for name, model in base_models:
pipe = Pipeline(
[
('transformer', base_transformer),
('model', model)
]
)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
y_pred_proba = pipe.predict_proba(X_test)[::, 1]
measures_results = {"model": name}
for measure in [accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score]:
measures_results[measure.__name__] = measure(y_test, y_pred)
print(f'{name} {measure.__name__}: {measure(y_test, y_pred)}')
for measure in [gini_score, roc_auc_score]:
measures_results[measure.__name__] = measure(y_test, y_pred_proba)
print(f'{name} {measure.__name__}: {measure(y_test, y_pred_proba)}')
test_results.append(measures_results)
print('----------')
lr_model accuracy_score: 0.8382638959975433 lr_model balanced_accuracy_score: 0.7399342557171832 lr_model precision_score: 0.7082417582417583 lr_model recall_score: 0.5513259195893926 lr_model f1_score: 0.6200096200096199 lr_model gini_score: 0.7542867434287661 lr_model roc_auc_score: 0.8771433717143831 ---------- dt_model accuracy_score: 0.8261848705087522 dt_model balanced_accuracy_score: 0.7488503585711672 dt_model precision_score: 0.6476014760147601 dt_model recall_score: 0.6005132591958939 dt_model f1_score: 0.6231691078561917 dt_model gini_score: 0.5484344190101829 dt_model roc_auc_score: 0.7742172095050914 ---------- rf_model accuracy_score: 0.8483979936533934 rf_model balanced_accuracy_score: 0.7641842159155937 rf_model precision_score: 0.7185109637939827 rf_model recall_score: 0.6026518391787853 rf_model f1_score: 0.6555012793672947 rf_model gini_score: 0.7797038715693936 rf_model roc_auc_score: 0.8898519357846968 ---------- [00:45:34] WARNING: D:\bld\xgboost-split_1637426510059\work\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. xgb_model accuracy_score: 0.8736820554816256 xgb_model balanced_accuracy_score: 0.7941418621894569 xgb_model precision_score: 0.7911392405063291 xgb_model recall_score: 0.6415739948674081 xgb_model f1_score: 0.7085498346717053 xgb_model gini_score: 0.8557686518651952 xgb_model roc_auc_score: 0.9278843259325976 ----------
test_results_df = pd.DataFrame(test_results)
test_results_df = pd.melt(test_results_df, id_vars="model", var_name="measure", value_name="value")
sns.catplot(x='measure', y='value', hue='model', data=test_results_df, kind='bar', aspect=3)
plt.show()
X = df.drop(columns=['income_level', 'education'])
y = df['income_level'].map({'<=50K': 0, '>50K': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)
cat_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
transformer = ColumnTransformer(
[
('age_discretizer', KBinsDiscretizer(n_bins=15, encode='ordinal'), ['age']),
('discretizer', KBinsDiscretizer(n_bins=5, encode='ordinal'), ['capital_loss', 'capital_gain', 'hours_per_week']),
('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features)
],
remainder='passthrough'
)
xgb_model = XGBClassifier(random_state=123, n_jobs=-1)
kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
[
('transformer', transformer),
('model', xgb_model)
]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.8407851091950549 +/- 0.0018503125137839295 min: 0.8373432300998208 max: 0.8428662827895074
X = df.drop(columns=['income_level', 'education'])
y = df['income_level'].map({'<=50K': 0, '>50K': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)
cat_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
transformer = ColumnTransformer(
[
('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features),
('log', FunctionTransformer(np.log1p), ['capital_loss']),
('std_scaler', StandardScaler(), ['age', 'capital_loss']),
('min_max_scaler', MinMaxScaler(), ['capital_gain', 'hours_per_week'])
],
remainder='passthrough'
)
xgb_model = XGBClassifier(random_state=123, n_jobs=-1)
kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
[
('transformer', transformer),
('model', xgb_model)
]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.8734164101017237 +/- 0.001380531779514911 min: 0.8714011516314779 max: 0.8754958413307742
def group_marital_status(df: pd.DataFrame) -> pd.DataFrame:
df_res = df.copy()
df_res['marital_status'] = df_res['marital_status'].replace(
['Married-civ-spouse', 'Married-AF-spouse'], 'Married'
)
df_res['marital_status'] = df_res['marital_status'].replace(
['Never-married', 'Divorced', 'Separated', 'Widowed', 'Married-spouse-absent'], 'Living-alone'
)
return df_res
df2 = group_marital_status(df)
X = df2.drop(columns=['income_level', 'education'])
y = df2['income_level'].map({'<=50K': 0, '>50K': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)
cat_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
transformer = ColumnTransformer(
[
('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features),
('std_scaler', StandardScaler(), ['age']),
('min_max_scaler', MinMaxScaler(), ['capital_gain', 'capital_loss', 'hours_per_week'])
],
remainder='passthrough'
)
xgb_model = XGBClassifier(random_state=123, n_jobs=-1)
kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
[
('transformer', transformer),
('model', xgb_model)
]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.8732116887172505 +/- 0.0021027070782437133 min: 0.8707613563659629 max: 0.8769033909149072
def create_living_alone(df: pd.DataFrame) -> pd.DataFrame:
df_res = df.copy()
df_res['living_alone'] = df_res['marital_status'].replace(
['Married-civ-spouse', 'Married-AF-spouse'], 'Married'
)
df_res['living_alone'] = df_res['living_alone'].replace(
['Never-married', 'Divorced', 'Separated', 'Widowed', 'Married-spouse-absent'], 'Living-alone'
)
return df_res
df2 = create_living_alone(df)
X = df2.drop(columns=['income_level', 'education'])
y = df2['income_level'].map({'<=50K': 0, '>50K': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)
cat_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'living_alone']
transformer = ColumnTransformer(
[
('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features),
('std_scaler', StandardScaler(), ['age']),
('min_max_scaler', MinMaxScaler(), ['capital_gain', 'capital_loss', 'hours_per_week'])
],
remainder='passthrough'
)
xgb_model = XGBClassifier(random_state=123, n_jobs=-1)
kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
[
('transformer', transformer),
('model', xgb_model)
]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.8724950492422922 +/- 0.001137462079053026 min: 0.8710007678525723 max: 0.8742162507997441
def group_race(df: pd.DataFrame) -> pd.DataFrame:
df_res = df.copy()
df_res['marital_status'] = df_res['marital_status'].replace(
['Asian-Pac-Islander', 'Amer-Indian-Eskimo'], 'Other'
)
return df_res
df2 = group_race(df)
X = df2.drop(columns=['income_level', 'education'])
y = df2['income_level'].map({'<=50K': 0, '>50K': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)
cat_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
transformer = ColumnTransformer(
[
('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features),
('std_scaler', StandardScaler(), ['age']),
('min_max_scaler', MinMaxScaler(), ['capital_gain', 'capital_loss', 'hours_per_week'])
],
remainder='passthrough'
)
xgb_model = XGBClassifier(random_state=123, n_jobs=-1)
kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
[
('transformer', transformer),
('model', xgb_model)
]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.8734164101017237 +/- 0.001380531779514911 min: 0.8714011516314779 max: 0.8754958413307742
def group_education(df: pd.DataFrame) -> pd.DataFrame:
df_res = df.copy()
df_res['education'] = df_res['education'].replace(
['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th'], 'Obligatory'
)
df_res['education'] = df_res['education'].replace(['HS-grad', 'Some-college'], 'HS-college')
df_res['education'] = df_res['education'].replace(['Assoc-voc', 'Assoc-acdm'], 'Assoc')
df_res['education'] = df_res['education'].replace(['Prof-school', 'Doctorate'], 'Academic')
return df_res
df2 = group_education(df)
X = df2.drop(columns=['income_level', 'education_num'])
y = df2['income_level'].map({'<=50K': 0, '>50K': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)
cat_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
transformer = ColumnTransformer(
[
('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features),
('ordinal', OrdinalEncoder(), ['education']),
('std_scaler', StandardScaler(), ['age']),
('min_max_scaler', MinMaxScaler(), ['capital_gain', 'capital_loss', 'hours_per_week'])
],
remainder='passthrough'
)
xgb_model = XGBClassifier(random_state=123, n_jobs=-1)
kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
[
('transformer', transformer),
('model', xgb_model)
]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.872648580455278 +/- 0.0012443643478918457 min: 0.870744816995137 max: 0.8743442098528471
def group_countries(df: pd.DataFrame) -> pd.DataFrame:
df_res = df.copy()
df_res['native_country'] = df_res['native_country'].apply(lambda x: x if x == "United-States" or x == "Mexico" else "Other")
return df_res
df2 = group_countries(df)
X = df2.drop(columns=['income_level', 'education'])
y = df2['income_level'].map({'<=50K': 0, '>50K': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)
cat_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
transformer = ColumnTransformer(
[
('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features),
('std_scaler', StandardScaler(), ['age']),
('min_max_scaler', MinMaxScaler(), ['capital_gain', 'capital_loss', 'hours_per_week'])
],
remainder='passthrough'
)
xgb_model = XGBClassifier(random_state=123, n_jobs=-1)
kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
[
('transformer', transformer),
('model', xgb_model)
]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.8724438295946986 +/- 0.0014491703431791297 min: 0.870744816995137 max: 0.8747280870121561
def group_gdp(df: pd.DataFrame, gdp: pd.DataFrame) -> pd.DataFrame:
df_res = df.copy()
low = gdp[gdp['Country Name'] == 'Low income']['gdp'].values[0]
low_middle = gdp[gdp['Country Name'] == 'Lower middle income']['gdp'].values[0]
upper_middle = gdp[gdp['Country Name'] == 'Upper middle income']['gdp'].values[0]
high = gdp[gdp['Country Name'] == 'High income']['gdp'].values[0]
conditions = [
(df_res["gdp"].lt(low)),
(df_res["gdp"].ge(low) & df_res["gdp"].lt(low_middle)),
(df_res["gdp"].ge(low_middle) & df_res["gdp"].lt(upper_middle)),
(df_res["gdp"].ge(upper_middle) & df_res["gdp"].lt(high)),
(df_res["gdp"].ge(high)),
]
choices = ["Low income", "Lower middle income", "Middle income", "Upper middle income", "High income"]
df_res["gdp_level"] = np.select(conditions, choices)
return df_res
gdp = pd.read_csv('data/gdp.csv')
merged = df.merge(gdp, how='left', left_on='native_country', right_on='Country Name')
merged.drop(columns=['Country Name', 'Country Code'], inplace=True)
merged.fillna(method='ffill', inplace=True)
df2 = group_gdp(merged, gdp)
X = df2.drop(columns=['income_level', 'gdp', 'education'])
y = df2['income_level'].map({'<=50K': 0, '>50K': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)
cat_features = ['workclass', 'marital_status', 'gdp_level', 'occupation', 'relationship', 'race', 'sex', 'native_country']
transformer = ColumnTransformer(
[
('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features),
('std_scaler', StandardScaler(), ['age']),
('min_max_scaler', MinMaxScaler(), ['capital_gain', 'capital_loss', 'hours_per_week'])
],
remainder='passthrough'
)
xgb_model = XGBClassifier(random_state=123, n_jobs=-1)
kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
[
('transformer', transformer),
('model', xgb_model)
]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.8725205984763147 +/- 0.001852653585169723 min: 0.870744816995137 max: 0.8754958413307742
gdp = pd.read_csv('data/gdp.csv')
merged = df.merge(gdp, how='left', left_on='native_country', right_on='Country Name')
merged.drop(columns=['Country Name', 'Country Code'], inplace=True)
merged.fillna(method='ffill', inplace=True)
df2 = group_gdp(merged, gdp)
df2 = group_countries(df2)
df2 = group_education(df2)
df2 = group_race(df2)
df2 = create_living_alone(df2)
X = df2.drop(columns=['income_level', 'gdp', 'education_num'])
y = df2['income_level'].map({'<=50K': 0, '>50K': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)
cat_features = ['workclass', 'marital_status', 'living_alone', 'gdp_level', 'occupation', 'relationship', 'race', 'sex', 'native_country']
transformer = ColumnTransformer(
[
('log', FunctionTransformer(np.log1p), ['capital_loss']),
('std_scaler', StandardScaler(), ['age', 'capital_loss']),
('min_max_scaler', MinMaxScaler(), ['capital_gain', 'hours_per_week']),
('ordinal', OrdinalEncoder(), ['education']),
('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features)
],
remainder='passthrough'
)
xgb_model = XGBClassifier(random_state=123, n_jobs=-1)
kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
[
('transformer', transformer),
('model', xgb_model)
]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.8729301067477193 +/- 0.0013467930057077492 min: 0.8708727924238546 max: 0.8751119641714651
params_distr = {
"model__n_estimators": [50, 100, 200],
"model__max_depth": [2, 3, 5, 10, 15],
"model__learning_rate": [0.5, 0.3, 0.1, 0.01],
}
search = RandomizedSearchCV(estimator=pipe, param_distributions=params_distr, verbose=True, n_jobs=-1)
search.fit(X, y)
Fitting 5 folds for each of 10 candidates, totalling 50 fits [10:43:18] WARNING: D:\bld\xgboost-split_1637426510059\work\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
RandomizedSearchCV(estimator=Pipeline(steps=[('transformer',
ColumnTransformer(remainder='passthrough',
transformers=[('log',
FunctionTransformer(func=<ufunc 'log1p'>),
['capital_loss']),
('std_scaler',
StandardScaler(),
['age',
'capital_loss']),
('min_max_scaler',
MinMaxScaler(),
['capital_gain',
'hours_per_week']),
('ordinal',
OrdinalEncoder(),
['education']),
('...
n_estimators=100,
n_jobs=-1,
num_parallel_tree=None,
predictor=None,
random_state=123,
reg_alpha=None,
reg_lambda=None,
scale_pos_weight=None,
subsample=None,
tree_method=None,
validate_parameters=None,
verbosity=None))]),
n_jobs=-1,
param_distributions={'model__learning_rate': [0.5, 0.3, 0.1,
0.01],
'model__max_depth': [2, 3, 5, 10, 15],
'model__n_estimators': [50, 100, 200]},
verbose=True)
best_random = search.best_estimator_
print(best_random)
Pipeline(steps=[('transformer',
ColumnTransformer(remainder='passthrough',
transformers=[('log',
FunctionTransformer(func=<ufunc 'log1p'>),
['capital_loss']),
('std_scaler',
StandardScaler(),
['age', 'capital_loss']),
('min_max_scaler',
MinMaxScaler(),
['capital_gain',
'hours_per_week']),
('ordinal', OrdinalEncoder(),
['education']),
('one_hot_encoder',
OneHotEncode...
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.5,
max_delta_step=0, max_depth=3,
min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=200,
n_jobs=-1, num_parallel_tree=1, predictor='auto',
random_state=123, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1,
verbosity=None))])
print(search.best_params_)
{'model__n_estimators': 200, 'model__max_depth': 3, 'model__learning_rate': 0.5}
show_metrics(best_random, X_test, y_test)
Accuracy: 0.8822806837956803 Precision: 0.8146186440677966 Recall: 0.6578272027373824 F1 score: 0.7278750591575959
gini_train_val(best_random, X_train, y_train, X_test, y_test)
gini_train: 0.8791 gini_val: 0.8771
explainer = dx.Explainer(best_random, X_train, y_train)
Preparation of a new explainer is initiated -> data : 39073 rows 14 cols -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. -> target variable : 39073 values -> model_class : xgboost.sklearn.XGBClassifier (default) -> label : Not specified, model's class short name will be used. (default) -> predict function : <function yhat_proba_default at 0x000001C77059FB80> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 1.15e-07, mean = 0.24, max = 1.0 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.975, mean = -0.000513, max = 0.999 -> model_info : package sklearn A new explainer has been created!
explainer.model_parts().plot()
explainer.model_profile().plot()
Calculating ceteris paribus: 100%|█████████████████████████████████████████████████████| 14/14 [00:03<00:00, 4.09it/s]
explainer.model_profile(variables=['age']).plot(geom='profiles')
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.19it/s]